source('00_util_scripts/mod_seurat.R')
source('00_util_scripts/mod_bplot.R')

ball.path <-
  list.files('~/append-ssd/iacobucci25all/', pattern = 'gz$', full.names = T)

ball.path[1:120]

sobj <- read_geo_supp(ball.path[121:267], name_regex = 'GSM\\d+', workers = 8)

valid.acc <- sobj$orig.ident |> unique()

sobj <- sobj |> PercentageFeatureSet('^MT-', col.name = 'mito.ratio')

sobj |> VlnPlot('mito.ratio', pt.size = 0)

sobj <- sobj |> filter(mito.ratio < 10)

sobj <- sobj |> NormalizeData()

sobj |> write_rds('mission/nudt21_il7r/iaco49.rds')

# remove bad files ----
tibble(path = ball.path, acc = str_extract(path, 'GSM\\d+')) |>
  filter(is.na(acc))

bad.file <- tibble(path = ball.path, acc = str_extract(path, 'GSM\\d+')) |>
  filter(!(acc %in% valid.acc))

file.remove(bad.file$path)

lefov <-
  read_csv('~/append-ssd/iacobucci25all/aria2.iaco.txt', col_names = F) |>
  mutate(acc = str_extract(X1, 'GSM\\d{5,}'))

lefov |>
  filter(!(acc %in% valid.acc)) |>
  mutate(acc = basename(X1)) |>
  write_tsv('~/append-ssd/iacobucci25all/aria2.bad40.txt', col_names = F)

lefov <- tibble(acc = str_c('GSM', 7728352:7728440)) |>
  filter(!(acc %in% valid.acc))

inlist <-
  read_csv('~/append-ssd/iacobucci25all/lftp.iaco.txt', col_names = F) |>
  mutate(acc = str_extract(X1, 'GSM\\d{5,}'))

lefov |>
  filter(!(acc %in% inlist$acc)) |>
  write_csv('~/append-ssd/iacobucci25all/to.lftp.txt', col_names = F)

inlist |>
  filter(acc %in% lefov$acc) |>
  select(-acc) |>
  write_csv('~/append-ssd/iacobucci25all/lftp.iaco.txt', col_names = F)

bad.file <- tibble(path = ball.path, acc = str_extract(path, 'GSM\\d+')) |>
  filter(!(acc %in% valid.acc))

sobj <- read_geo_supp(bad.file$path, name_regex = 'GSM\\d+')

valid.acc <- sobj$orig.ident |> unique()


# average on samples ------------
sobj |> DotPlot(c('NUDT21', 'IL7R'))

exp21.7 <- last_plot() |>
  pluck('data')

exp21.7 <- exp21.7 |>
  as_tibble() |>
  pivot_wider(id_cols = id, names_from = features.plot, values_from = avg.exp) |>
  write_csv('mission/nudt21_il7r/iaco41.89.2gene.csv')

exp21.7 <- read_csv('mission/nudt21_il7r/iaco1.40.2gene.csv') |>
  bind_rows(exp21.7)

exp21.7 |>
  ggplot(aes(NUDT21, IL7R)) +
  geom_point() +
  geom_smooth(method = 'lm') +
  stat_cor() +
  theme_pubr() +
  labs(title = 'Correlation of NUDT21 and IL7R expression in B-ALL cells',
       subtitle = 'GSE241405 (n=89)')

# density on cells? --------


# Almeida 21 NC -------
## proteomics --------
alm.prot <- read_delim('mission/nudt21_il7r/almeida21.protein.txt')

alm.lfq <- alm.prot |> select(`Gene names`, contains('LFQ'))

alm.lfq |>
  filter(`Gene names` %in% c('Nudt21')) |>
  pivot_longer(-1) |>
  mutate(group = case_when(str_detect(name, '1|2|3') ~ 'WT',
                           str_detect(name, '4|5|6') ~ 'Hetero',
                           .default = 'Homo') |>
           fct_relevel('WT', 'Hetero')) |>
  ggplot(aes(group, log(value))) +
  geom_dotgraph +
  stat_summary(geom = 'crossbar', fun = 'mean', width = .5) +
  labs(title = 'Protein abundance of NUDT21 in mice pre/pro-B/leukemic cells',
       x = 'IL7R gain-of-function mutation',
       y = 'Log-normalized protein abundance',
       subtitle = 'PXD017147') +
  theme_pubr()

## RNA ----------
alm.rna <- read_csv('mission/nudt21_il7r/almeida21fig2e.csv')

alm.rna <- tidybulk::ensembl_to_symbol(alm.rna, ...1)

rna2 <- alm.rna |> filter(transcript %in% c('Nudt21', 'Il7r')) |>
  select(-c(...1, ref_genome)) |>
  pivot_longer(contains('SJMM'))

rna2 <- rna2 |>
  mutate(group = str_extract(name, '(BM|SPL).+')) |>
  filter(!str_detect(group, 'SPL')) |>
  mutate(group = case_match(group, 'BM-CTRL' ~ 'CTRL',
                            'BM-Pre' ~ 'IL7R-mut pre-leukemic',
                            'BM-Het' ~ 'IL7R-mut leukemic') |>
           fct_relevel('IL7R-mut leukemic', after = Inf))

rna2 |>
  ggplot(aes(group, value, color = group)) +
  geom_boxplot() +
  facet_wrap(~transcript, scales = 'free_y') +
  stat_compare_means(comparisons = list(c('CTRL', 'IL7R-mut leukemic')),
                     label = 'p.signif') +
  labs(y = 'Normalized RNA level',
       title = 'Mice pre/pro-B/leukemic cells',
       subtitle = 'PRJNA718778') +
  theme_pubr() +
  theme(axis.text.x = element_blank()) +
  scale_color_hue(direction = -1)

rna2 |>
  pivot_wider(names_from = transcript, values_from = value) |>
  ggplot(aes(Il7r, Nudt21)) +
  geom_point(aes(color = group)) +
  geom_smooth(method = 'lm') +
  stat_cor() +
  theme_pubr() +
  labs(title = 'Correlation of Il7r and Nudt21 expression in mice pre+pro-B/leukemic cells',
       subtitle = 'PRJNA718778 (n=19)')

alm.rna2 <- read_csv('mission/nudt21_il7r/almeida21fig6a.csv')

alm.rna2 <- tidybulk::ensembl_to_symbol(alm.rna2, ...1)

homo.rna2 <- alm.rna2 |> filter(transcript %in% c('Nudt21', 'Il7r')) |>
  select(-c(...1, ref_genome)) |>
  pivot_longer(contains('SJMM')) |>
  mutate(group = str_extract(name, 'BM.+'))

homo.rna2 |>
  ggplot(aes(group, value, color = group)) +
  geom_boxplot() +
  facet_wrap(~transcript, scales = 'free_y') +
  stat_compare_means() +
  labs(y = 'Normalized RNA level',
       title = 'Mice leukemic cells',
       subtitle = 'PRJNA718778') +
  theme_pubr() +
  scale_color_hue(direction = -1)

homo.rna2 |>
  pivot_wider(names_from = transcript, values_from = value) |>
  ggplot(aes(Il7r, Nudt21)) +
  geom_point(aes(color = group)) +
  geom_smooth(method = 'lm') +
  stat_cor() +
  theme_pubr() +
  labs(title = 'Correlation of Il7r and Nudt21 expression in mice pre+pro-B/leukemic cells',
       subtitle = 'PRJNA718778 (n=19)')
